###############################################################################
# This file provides code for  
#
# Moore, Ryan T. and Nirmala Ravishankar. ``Who Loses in Direct Democracy?''.  
# Social Science Research, 41(3):646-656, May 2012.
#
# Performs 2002 - 2004 post-imputation analyses
###############################################################################

library(MASS)

source("loadData0006.R")
source("imputeData0006.R")

print(dim(lat.mi$imputations[[1]]))
## [1] 48901    15
print(length(unique(lat.na$prop)))
## [1] 14
print(48901/14)
print(c(min(lat.na$year), max(lat.na$year)))

## calculate fraction in each category:
factorSummary <- function(x){
 for(i in 1:ncol(x)){
 	cat(names(x)[i], "\n")
 	print(round(summary(as.factor(x[,i]))/(sum(summary(as.factor(x[,i]))) - sum(summary(as.factor(lat.na[,i]))[grep("NA", names(summary(as.factor(lat.na[,i]))))])), 3))
 }	
}

## For NA data ##
## Summary stats for Available cases:
factorSummary(lat.na)
## Fraction missing cells:
## Define NA counting function
nasum <- function(x){   
  return(sum(is.na(x)))
}
wh.ind <- c("gender", "race", "ideology", "party", "age", "educ", "income", "winner")
ind.dat <- lat.na[, wh.ind]
print(round(sum(apply(ind.dat, 2, nasum))/(nrow(ind.dat)*ncol(ind.dat)), 3))
## [1] 0.067   prop of cells missing.
rm(wh.ind, ind.dat)

## Fraction missing at least one cell by race
varMissByValue <- function(x, var){
	prop.na <- NULL
	vals <- sort(unique(x[, var]))
	for(i in 1:length(vals)){
		sub <- x[x[, var] %in% vals[i], ]
		isna <- t(apply(sub, 1, is.na))
		isna.sum.vec <- apply(isna, 1, sum)
		prop.na[i] <- mean(isna.sum.vec > 0) 
	}
	cat(var, "\n")
	print(cbind(vals, prop.na))
}

## Win/Lose missingness overall
varMissByValue(lat.na, "winner")

## Win/Lose missingness by race
varMissByValue(lat.na, "race")

## 'Replicate' HGL Table 1
t1 <- glm(winner ~ as.factor(region) + as.factor(gender) + as.factor(race) + ideology + as.factor(party) + as.factor(age) + as.factor(educ) + as.factor(income) + margin, data = lat.na, family = binomial(logit))

## How many obs?
nrow(lat.na)-10862 ## 38039

## so, 7% of cells are missing, but 22% of observations lost to LD.
10862/nrow(lat.na)  ##[1] 0.2221222

## Simulate 5000 coefficients
## Create hypothetical voter profile
## Create fitted values
## Create 95\% CI's
## Create first differences: white - black, wh-hi, wh-as

fd.func <- function(dat, nsims = 5000, imputed.list = FALSE, rounder = 3, seed = NULL, reg.val = rep(0,4), gen.val = 1, race.wh = rep(0, 4), race.bl = c(1,0,0,0), race.hi = c(0,1,0,0), race.as = c(0,0,1,0), ideo.val = 2, part.val = c(0,1,0), age.val = c(0,1,0,0), educ.val = c(0,1,0,0,0), inc.val = c(0,1,0,0,0), margin = 26, fml = (winner ~ as.factor(region) + as.factor(gender) + as.factor(race) + ideology + as.factor(party) + as.factor(age) + as.factor(educ) + as.factor(income) + margin)){
	
	quant.func <- function(x){quantile(x, c(.025, .975))}
	
	if(imputed.list == FALSE){
		length.unique.margin <- length(unique(dat$margin))
		tmp <- glm(fml, data = dat, family = binomial(logit))
		
		if(!is.null(seed)){
			set.seed(seed)	
		}
		beta.sim <- mvrnorm(nsims, tmp$coefficients, summary(tmp)$cov.unscaled)
	}
	
	if(imputed.list == TRUE){
		D <- length(dat)
		length.unique.margin <- length(unique(dat[[1]]$margin))

		for(j in 1:D){
			tmp <- glm(fml, data = dat[[j]], family = binomial(logit))
			if(j == 1){
				beta.store <- se.store <- matrix(NA, D, length(tmp$coefficients))
			}
			beta.store[j,] <- tmp$coefficients
    		colnames(beta.store) <- names(tmp$coefficients)
    		se.store[j,] <- (summary(tmp)$coef)[,2]
    	}
  		b <- apply(beta.store, 2, mean)
  		var.w <- apply((se.store)^2, 2, mean)
  		quadforms <- matrix(NA, D, length(tmp$coefficients))
  		for(k in 1:D){
    		quadforms[k,] <- (beta.store[k,]-b)^2
  		}
  		var.b <- (1/(D-1))*apply(quadforms,2, sum)
	  se <- sqrt(var.w + ((D+1)/D)*var.b)

		if(!is.null(seed)){
			set.seed(seed)	
		}
		beta.sim <- mvrnorm(nsims, b, diag(se)^2)
	}
	
	## hyp: intercept, reg=LA, gender=m, RACE, ideo=2, party=I, age=30-44, educ=some col, inc=40-59k, margin=26
	white <- c(1, reg.val, gen.val, race.wh, ideo.val, part.val, age.val, educ.val, inc.val)
	black <- c(1, reg.val, gen.val, race.bl, ideo.val, part.val, age.val, educ.val, inc.val)
	latino <- c(1, reg.val, gen.val, race.hi, ideo.val, part.val, age.val, educ.val, inc.val)
	asian <- c(1, reg.val, gen.val, race.as, ideo.val, part.val, age.val, educ.val, inc.val)
	
	if(length.unique.margin > 1){
		white <- append(white, margin)
		black <- append(black, margin)
		latino <- append(latino, margin)
		asian <- append(asian, margin)
	}
	
	hyp.covars <- matrix(rbind(white, black, latino, asian), nrow = 4)
	colnames(hyp.covars) <- names(coef(tmp))
	rownames(hyp.covars) <- c("white", "black", "latino", "asian")
	
	fits <- 1/(1+exp(-hyp.covars%*%t(beta.sim)))
	print(round(apply(fits, 1, mean), rounder))
	print(round(apply(fits, 1, quant.func), rounder))
	## First Differences
	first.diffs <- matrix(NA, 4, nsims)
	rownames(first.diffs) <- c("white", "black", "latino", "asian")
	for(i in 1:nrow(fits)){
  		first.diffs[i,] <- fits[i,]-fits[1,]
	}
	print(round(apply(first.diffs,1,mean), rounder))
	print(round(apply(first.diffs,1,quant.func), rounder))
	return(list(fits = fits, first.diffs = first.diffs))
}

## Table 7, columns 1 and 2:
## For LD data ##
na.out <- fd.func(lat.na, nsims=10000, rounder = 4, seed = 439)

## Table 7, columns 3 and 4:
## For MI data ##
mi.out <- fd.func(lat.mi$imputations, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 426)


rankfits <- function(list){
	lll <- length(list)
	tmp.vec <- array(NA)
	for(i in 1:lll){
		tmp.vec <- append(tmp.vec, apply(list[[i]]$fits, 1, mean))	
		names(tmp.vec)[(4*i-3):(4*i)] <- paste(names(tmp.vec)[(4*i-3):(4*i)], names(tmp.list)[i], sep="")
	}
	tmp.vec <- tmp.vec[2:length(tmp.vec)]
	print(round(sort(tmp.vec, dec=TRUE), 3))
	cat("\n")
	print(round(sort(tmp.vec-tmp.vec[1], dec=TRUE), 3))
}

lat.na.nmt <- lat.na[lat.na$type != "Race",]

## For LD data ##
## Table 6, columns 1 and 2:
na.nmt.out <- fd.func(lat.na.nmt, nsims=10000, rounder = 4, seed = 416)

sublist <- function(dat.list, var = NULL, cut = NULL, keep = NULL){
	subl <- list()
	for(i in 1:length(dat.list)){
		if(!is.null(cut)){
			subl[[i]] <- dat.list[[i]][(!(dat.list[[i]][, var] %in% cut)), ]
		}
		if(!is.null(keep)){
			subl[[i]] <- dat.list[[i]][(dat.list[[i]][, var] %in% keep), ]
		}
	}
	return(subl)	
}

lat.mi.nmt <- sublist(lat.mi$imputations, var = "type", cut = "Race")

## For MI data ##
## Table 6, columns 3 and 4:
mi.nmt.out <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 405)

## For all income levels:
mi.out.20 <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 739, inc.val = c(0,0,0,0,0)) 
mi.out.39 <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 739, inc.val = c(1,0,0,0,0))
mi.out.59 <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 739, inc.val = c(0,1,0,0,0))
mi.out.74 <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 739, inc.val = c(0,0,1,0,0))
mi.out.100 <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 739, inc.val = c(0,0,0,1,0))
mi.out.rich <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 739, inc.val = c(0,0,0,0,1))
tmp.list <- list(poor = mi.out.20, l40=mi.out.39, l60=mi.out.59, l75=mi.out.74, l100=mi.out.100, rich = mi.out.rich)
rm(mi.out.20, mi.out.39, mi.out.59, mi.out.74, mi.out.100, mi.out.rich)
rankfits(tmp.list)

## For all education levels:
mi.out.nohs <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 768, educ.val = c(0,0,0,0,0)) 
mi.out.hs <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 768, educ.val = c(1,0,0,0,0))
mi.out.col <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 768, educ.val = c(0,1,0,0,0))
mi.out.ba <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 768, educ.val = c(0,0,1,0,0))
mi.out.grad <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 768, educ.val = c(0,0,0,1,0))
mi.out.ma <- fd.func(lat.mi.nmt, nsims=10000, imputed.list = TRUE, rounder = 4, seed = 768, educ.val = c(0,0,0,0,1))
tmp.list <- list(nohs=mi.out.nohs, hs=mi.out.hs, col=mi.out.col, ba=mi.out.ba, grad=mi.out.grad, ma=mi.out.ma)
rm(mi.out.nohs, mi.out.hs, mi.out.col, mi.out.ba, mi.out.grad, mi.out.ma)
rankfits(tmp.list)

## process for xtable:
processXtable <- function(fd.out.na, fd.out.mi){
	
	quant.func <- function(x){quantile(x, c(.025, .975))}
	storage <- matrix(NA, 8, 4)
	fd.out.na$fits <- round(fd.out.na$fits, 3)
	fd.out.na$first.diffs <- round(fd.out.na$first.diffs, 3)
	fd.out.mi$fits <- round(fd.out.mi$fits, 3)
	fd.out.mi$first.diffs <- round(fd.out.mi$first.diffs, 3)
	
	storage[c(1,3,5,7), 1] <- round(apply(fd.out.na$fits, 1, mean), 3)
	storage[c(1,3,5,7), 2] <- round(apply(fd.out.na$first.diffs, 1, mean), 3)
	storage[c(1,3,5,7), 3] <- round(apply(fd.out.mi$fits, 1, mean), 3)
	storage[c(1,3,5,7), 4] <- round(apply(fd.out.mi$first.diffs, 1, mean), 3)
	storage[c(2,4,6,8), 1] <- paste("(", apply(fd.out.na$fits, 1, quantile, .025), ", ", apply(fd.out.na$fits, 1, quantile, .975), ")", sep="")
	storage[c(2,4,6,8), 2] <- paste("(", apply(fd.out.na$first.diffs, 1, quantile, .025), ", ", apply(fd.out.na$first.diffs, 1, quantile, .975), ")", sep="")
	storage[c(2,4,6,8), 3] <- paste("(", apply(fd.out.mi$fits, 1, quantile, .025), ", ", apply(fd.out.mi$fits, 1, quantile, .975), ")", sep="")
	storage[c(2,4,6,8), 4] <- paste("(", apply(fd.out.mi$first.diffs, 1, quantile, .025), ", ", apply(fd.out.mi$first.diffs, 1, quantile, .975), ")", sep="")	
	
	storage <- storage[c(3:8, 1:2), ]
	storage[7:8, 2] <- ""
	storage[7:8, 4] <- ""
	return(storage)
}

## Table 7
t.0204all <- processXtable(na.out, mi.out)

## Table 6
t.0204nmt <- processXtable(na.nmt.out, mi.nmt.out)


## runxtable
library(xtable)
runxtable <- function(obj, cap, lab, file){
	tmp <- xtable(obj, caption= cap, label=lab, align=rep("c", 5)) 
	rownames(tmp) <- c("Blacks", "b", "Latinos", "l",
                          "Asians", "a", "Whites", "w")
  colnames(tmp) <- rep(c("Prob of Winning", "Difference"),2)
  print(tmp, file = file, caption.placement = "bottom")
}

runxtable(t.0204all, "Mean and 95\\% Confidence Intervals for Predicted Probabilities and First Differences in All Propositions, 2002-2004.  Negative difference estimates indicate disadvantage relative to whites.", lab = "t:0204all", "~/Desktop/t0204all.tex")

runxtable(t.0204nmt, "Mean and 95\\% Confidence Intervals for Predicted Probabilities and First Differences in Non-Minority Targeted Propositions, 2002-2004.  Negative difference estimates indicate disadvantage relative to whites.", lab = "t:0204nmt", file = "~/Desktop/t0204nmt.tex")
